출산률 - 국가적 차원에서의 고찰

서론

우리의 궁극적 목표

본론

1 데이터 전처리

1.1 필요 라이브러리 불러오기

# Load all required packages, silencing startup messages and warnings.
# FIX: the original piped `suppressMessages(library(x)) %>% suppressWarnings()`,
# which applies suppressWarnings() to the RESULT of the library call — after
# any warnings have already been emitted. Wrapping the call itself is correct.
# (dplyr was also listed twice; loading is idempotent, so it appears once here.)
pkgs <- c(
  "tidyverse", "glmnet", "dplyr", "lm.beta", "XML", "httr", "stringr",
  "QuantPsyc", "car", "sjPlot", "Epi", "caret", "dvmisc", "Metrics",
  "yardstick", "e1071", "descr"
)
for (pkg in pkgs) {
  suppressWarnings(suppressMessages(library(pkg, character.only = TRUE)))
}
options(scipen = 1000)    # avoid scientific notation in printed numbers

1.2 데이터 불러오기

# Birth rate: scrape the per-country birth-rate table from worldpopulationreview.com.
url <- "https://worldpopulationreview.com/country-rankings/birth-rate-by-country"
html_source <- GET(url)
# Parse every HTML table on the page into a list of data frames.
tabs <- readHTMLTable(rawToChar(html_source$content), stringsAsFactors=F)
df_birth_rate = as.data.frame(tabs)
df_birth_rate <- df_birth_rate[,-3]   # drop column 3 — assumes a fixed table layout; verify if the site changes
colnames(df_birth_rate) <- c("Country", "Birth rate")
# Normalize country names (lowercase, no spaces) so they join with the other tables.
df_birth_rate[,1] <- tolower(df_birth_rate[,1])
df_birth_rate$Country <- gsub(" ", "", df_birth_rate$Country)


# Age at first marriage: scrape the Wikipedia list (one table per continent) and stack them.
url_first_marriage <- "https://en.wikipedia.org/wiki/List_of_countries_by_age_at_first_marriage"
html_source <- GET(url_first_marriage)
tabs <- readHTMLTable(rawToChar(html_source$content), stringsAsFactors=F)
# Empty accumulator with the expected column layout.
df_first_marriage = data.frame(Country = character(), Men = numeric(), Women = numeric(), 
                               Average = numeric(), AgeGap = numeric(), AgeRatio = numeric(),
                               Year = numeric(), Source = character())
for (i in 1:5){
  table = as.data.frame(tabs[i])
  # Each scraped table carries its header in row 1; promote it and drop the row.
  colnames(table) = table[1,]
  table = table[-1,]
  # Force the accumulator's names to match the current table so rbind() accepts it.
  # NOTE(review): rbind inside a loop is O(n^2) and this assumes all five tables
  # share the same column count — confirm against the page layout.
  colnames(df_first_marriage) <- colnames(table)
  df_first_marriage = rbind(df_first_marriage, table)
}
# Keep country plus the men's and women's marriage-age columns.
df_first_marriage <- df_first_marriage[, c(1,2,3)]
colnames(df_first_marriage) <- c("Country", "Men marry", "Women marry")
# Normalize country names for joining: lowercase, no spaces.
df_first_marriage$Country <- tolower(df_first_marriage$Country)
df_first_marriage$Country <- gsub(" ", "", df_first_marriage$Country)


# Education level: scrape the UN Education Index table from Wikipedia.
url <- "https://en.wikipedia.org/wiki/Education_Index"
html_source <- GET(url)
tabs <- readHTMLTable(rawToChar(html_source$content), stringsAsFactors=F)
df_study = as.data.frame(tabs[1])
# Row 1 holds the header; promote it and drop it from the data.
colnames(df_study) = df_study[1,] ; df_study = df_study[-1,]
df_study <- df_study[, c(1, 31)]   # keep country name + the latest index column
colnames(df_study) <- c("Country", "education")
# Normalize country names for joining.
df_study$Country <- tolower(df_study$Country)
df_study$Country <- gsub(" ", "", df_study$Country)
df_study[91,1] <- "southkorea"     # row 91 is Korea under its official name — TODO confirm the row index
df_study$Country <- gsub("bosniaandherzegovina", "bosnia", df_study$Country)
df_study$Country <- gsub("unitedkingdom", "england", df_study$Country)
# BUG FIX: "(" and ")" are regex metacharacters, so the original pattern never
# matched the literal string "tanzania(unitedrepublicof)". fixed = TRUE makes
# the match literal.
df_study$Country <- gsub("tanzania(unitedrepublicof)", "tanzania", df_study$Country, fixed = TRUE)


# Suicide rate: scrape the Wikipedia list (header = F because row 1 is the header row).
url <- "https://en.wikipedia.org/wiki/List_of_countries_by_suicide_rate"
html_source <- GET(url)
tabs <- readHTMLTable(rawToChar(html_source$content), stringsAsFactors=F, header = F)
df_suicide = as.data.frame(tabs[1])
colnames(df_suicide) = df_suicide[1,] ; df_suicide = df_suicide[-1,]
df_suicide <- df_suicide[,-2]   # drop column 2 — presumably the both-sexes rate; confirm the layout
colnames(df_suicide) <- c("Country", "Men suicide", "Women suicide")
# Normalize country names: lowercase, no spaces, no punctuation.
df_suicide$Country <- tolower(df_suicide$Country)
df_suicide$Country <- gsub(" ", "", df_suicide$Country)
df_suicide$Country <- gsub("[[:punct:]]", "", df_suicide$Country)
# Hard-coded row fixes to match names used by the other tables — fragile if the
# page changes; verify these row indices.
df_suicide[22,1] <- "bosnia"
df_suicide[174,1] <- "england"


# Child-care cost: scrape Numbeo's price-by-country table (item 224, prices in USD).
url <- "https://www.numbeo.com/cost-of-living/prices_by_country.jsp?displayCurrency=USD&itemId=224"
html_source <- GET(url)
tabs <- readHTMLTable(rawToChar(html_source$content), stringsAsFactors=F)
df_childcost = as.data.frame(tabs[2])
df_childcost = df_childcost[,-1]   # drop the first column (presumably rank — confirm)
colnames(df_childcost) <- c("Country", "child cost")
# Normalize country names: lowercase, no spaces.
df_childcost$Country <- tolower(df_childcost$Country)
df_childcost$Country <- gsub(" ", "", df_childcost$Country)
# Hard-coded row fixes to match the naming used elsewhere — verify indices.
df_childcost[45,1] <- "england"
df_childcost[61,1] <- "bosnia"


# Share of population aged 65+: scrape the Wikipedia age-structure list.
url <- "https://en.wikipedia.org/wiki/List_of_countries_by_age_structure"
html_source <- GET(url)
tabs <- readHTMLTable(rawToChar(html_source$content), stringsAsFactors=F)
df_oldratio = as.data.frame(tabs[1])
colnames(df_oldratio) = df_oldratio[1,] ; df_oldratio = df_oldratio[-1,]
# Drop one more leading row and columns 2-3 (the younger age brackets) —
# NOTE(review): confirm exactly which row/columns these positional indices hit.
df_oldratio = df_oldratio[-1,c(-2:-3)]
colnames(df_oldratio) <- c("Country", "old ratio")
# Normalize country names for joining.
df_oldratio$Country <- tolower(df_oldratio$Country)
df_oldratio$Country <- gsub(" ", "", df_oldratio$Country)
df_oldratio$Country <- gsub("bosniaandherzegovina", "bosnia", df_oldratio$Country)
df_oldratio$Country <- gsub("republicofthecongo", "congo", df_oldratio$Country)
df_oldratio$Country <- gsub("unitedkingdom", "england", df_oldratio$Country)


# Happiness index: read the cached World Happiness scores from GitHub.
df_happiness <- read.csv("https://raw.githubusercontent.com/jkworldchampion/Data_analytics/ahyoung/%EB%82%98%EB%9D%BC%EB%B3%84%20%ED%96%89%EB%B3%B5%EC%A7%80%EC%88%98.csv")
df_happiness <- df_happiness[,2:3]   # keep country name + score columns
colnames(df_happiness) <- c("Country", "happy score")
# Normalize country names for joining.
df_happiness$Country <- tolower(df_happiness$Country)
df_happiness$Country <- gsub(" ", "", df_happiness$Country)
df_happiness$Country <- gsub("bosniaandherzegovina", "bosnia", df_happiness$Country)
# BUG FIX: parentheses are regex metacharacters, so the original pattern never
# matched "congo(brazzaville)" literally; fixed = TRUE matches it as-is.
df_happiness$Country <- gsub("congo(brazzaville)", "congo", df_happiness$Country, fixed = TRUE)
df_happiness$Country <- gsub("unitedkingdom", "england", df_happiness$Country)
# NOTE(review): this maps South Sudan onto "sudan", merging two distinct
# countries — confirm that is intentional.
df_happiness$Country <- gsub("southsudan", "sudan", df_happiness$Country)


# Gini coefficient: EUC-KR-encoded CSV with one column per year.
df_gini <- read.csv("https://raw.githubusercontent.com/jkworldchampion/Data_analytics/main/final_report/data_set/%EC%A7%80%EB%8B%88%EA%B3%84%EC%88%98.csv", fileEncoding = 'euc-kr')
# Replace "-" placeholders with NA in every column.
df_gini[] <- lapply(df_gini, function(x){gsub("-", NA, x)})
rownames(df_gini) <- df_gini[,1]
# Coerce all columns to numeric; the country-name column becomes NA (a coercion
# warning is expected) but is restored from the rownames below.
df_gini[] <- lapply(df_gini, function(x){as.numeric(x)})
gini_mean <- round(rowMeans(df_gini, na.rm = T),2)   # per-country mean across years
df_gini <- cbind(df_gini, gini_mean)
df_gini[,1] <- rownames(df_gini)   # put country names back into column 1
df_gini <- df_gini[,c(1,12)]       # keep country + the mean column (position 12)
colnames(df_gini) <- c("Country", "gini_score")
# Normalize country names for joining.
df_gini$Country <- tolower(df_gini$Country)
df_gini$Country <- gsub(" ", "", df_gini$Country)


# Countries where abortion is legal: a yes/no indicator per country.
df_abortionlegal <- read.csv("https://raw.githubusercontent.com/jkworldchampion/Data_analytics/main/abortionlegal.csv", header=FALSE)
# Drop the trailing row (219) and the unused third column.
df_abortionlegal <- df_abortionlegal[-219, -3]
df_abortionlegal <- as.data.frame(df_abortionlegal)
names(df_abortionlegal) <- c("Country", "legal")
# Collapse every value other than "yes" to "no".
not_yes <- df_abortionlegal$legal != 'yes'
df_abortionlegal$legal[not_yes] <- 'no'
# Normalize country names: lowercase, no spaces.
df_abortionlegal$Country <- gsub(" ", "", tolower(df_abortionlegal$Country))


# Multi-indicator country-profile dataset (many columns per country).
vary <- read.csv("https://raw.githubusercontent.com/jkworldchampion/Data_analytics/main/final_report/data_set/%E1%84%82%E1%85%A1%E1%84%85%E1%85%A1%E1%84%83%E1%85%A6%E1%84%8B%E1%85%B5%E1%84%90%E1%85%A5.csv", header = FALSE)
colnames(vary) <- vary[1,]   # first row holds the header
vary <- vary[-1,]
df_vary <- as.data.frame(vary)
colnames(df_vary)[1] <- c("Country")
# Normalize country names: lowercase, no spaces, no punctuation.
df_vary$Country <- tolower(df_vary$Country)
df_vary$Country <- gsub(" ", "", df_vary$Country)
df_vary$Country <- gsub("[[:punct:]]", "", df_vary$Country)
df_vary <- df_vary[,c(-2,-40,-41)]   # drop unused columns — positional; verify against the CSV


# GDP (nominal): scrape the Wikipedia country table.
url <- "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)"
html_source <- GET(url)
tabs <- readHTMLTable(rawToChar(html_source$content), stringsAsFactors=F)
df_gdp = as.data.frame(tabs[3])
df_gdp <- df_gdp[5:194,-3]   # rows 5-194 are countries; drop column 3 — verify layout
colnames(df_gdp) <- c("Country", "gdp")
# Normalize country names: lowercase, no spaces, no punctuation.
df_gdp$Country <- tolower(df_gdp$Country)
df_gdp$Country <- gsub(" ", "", df_gdp$Country)
df_gdp$Country <- gsub("[[:punct:]]", "", df_gdp$Country)
df_gdp[2,1] <- "china"   # row 2's scraped name is replaced outright — verify index
# Clean up footnote artifacts left behind after punctuation stripping,
# and align names with the other tables.
df_gdp$Country <- gsub("bosniaandherzegovina", "bosnia", df_gdp$Country)
# NOTE(review): mapping "solomonislands" to "island" looks wrong — confirm the
# intended target name.
df_gdp$Country <- gsub("solomonislands", "island", df_gdp$Country)
df_gdp$Country <- gsub("lebanon2020", "lebanon", df_gdp$Country)
df_gdp$Country <- gsub("moldovan8", "moldova", df_gdp$Country)
df_gdp$Country <- gsub("ukrainen5", "ukraine", df_gdp$Country)
df_gdp$Country <- gsub("unitedkingdom", "england", df_gdp$Country)
df_gdp$gdp<- gsub(",", "", df_gdp$gdp)   # remove thousands separators before numeric conversion


# Gender-equality index: scrape the Global Gender Gap Report table from Wikipedia.
url <- "https://en.wikipedia.org/wiki/Global_Gender_Gap_Report"
html_source <- GET(url)
tabs <- readHTMLTable(rawToChar(html_source$content), stringsAsFactors=F)
df_equality = as.data.frame(tabs[3])
df_equality <- df_equality[-1,]          # drop the header row
df_equality <- df_equality[-1,c(1,17)]   # drop one more leading row; keep country + score column 17
colnames(df_equality) = c("Country", "equality")
# Normalize country names: lowercase, no spaces, no punctuation.
df_equality$Country <- tolower(df_equality$Country)
df_equality$Country <- gsub(" ", "", df_equality$Country)
df_equality$Country <- gsub("[[:punct:]]", "", df_equality$Country)
# (FIX: removed a stray `df <- full_join(df_gini, df_equality, by = 'Country')`
# here — it was dead code, since `df` is rebuilt from scratch with inner joins
# before its first use.)
df_equality$Country <- gsub("korearep", "southkorea", df_equality$Country)
df_equality$Country <- gsub("kyrgyzrepublic", "kyrgyzstan", df_equality$Country)
df_equality$Country <- gsub("bosniaandherzegovina", "bosnia", df_equality$Country)
df_equality$Country <- gsub("unitedkingdom", "england", df_equality$Country)
df_equality$Country <- gsub("democraticrepublicofthecongo", "congo", df_equality$Country)


# OECD membership: read the member-country list and attach a constant
# indicator column (1 = OECD member).
df_oecd <- read.csv("https://raw.githubusercontent.com/jkworldchampion/Data_analytics/main/final_report/data_set/OECD%20country.csv")
df_oecd[["OECD"]] <- 1
names(df_oecd) <- c("Country", "OECD")

1.3 데이터 합치기, 이름바꾸기

# Merge every per-country table into one frame. Inner joins mean a country
# survives only if it appears in ALL sources.
df <- inner_join(df_gini, df_abortionlegal, by = 'Country')
df <- inner_join(df, df_study, by = 'Country')
df <- inner_join(df, df_oldratio, by='Country')
df <- inner_join(df, df_happiness, by='Country')
df <- inner_join(df, df_childcost, by='Country')
df <- inner_join(df, df_suicide, by = "Country")
df <- inner_join(df, df_birth_rate, by = "Country")
df <- inner_join(df, df_gdp, by = 'Country')
df <- inner_join(df, df_equality, by='Country')
df <- inner_join(df, df_vary, by='Country')
df$Country <- str_to_title(df$Country)   # restore Title Case for display

# Give the country-profile columns (positions 13-58) readable names.
# NOTE(review): purely positional — breaks silently if any upstream table
# changes width. ("balance of paments" is presumably a typo for "payments",
# but the string is referenced as data, so it is left unchanged here.)
colnames(df)[13:58] <- c("surface area", "population in thousands", "population density",
                         "sex ratio", "gdp_", "gdp growth rate", "gdp per capita", "agriculture",
                         "industry","services and other activity", "agriculture employed",
                         "industry employed","service employed", "unemployed", "labour force",
                         "agriculture production", "food production","exports", "imports",
                         "trade balance", "balance of paments", "population growth rate", "urban population",
                         "urban population growth rate", "fertility rate", "life expectancy at birth", 
                         "population age distribution", "international migrant", "refugees",
                         "infant mortality rate","health expenditure", "physicians",
                         "education government expenditure", "primary education", 
                         "secondary education", "tertiary education",
                         "women politician", "internet using", "threatened species",
                         "forested area", "CO2 emission", "energy production", "energy supply",
                         "water", "sanitation facilities", "net official")

1.4 데이터 변형

# Recode the abortion-legality flag to numeric 0/1.
df$legal <- ifelse(df$legal == 'yes', 1, 0)

# Strip the "%" sign and non-breaking spaces from the 65+ ratio column.
df$`old ratio` <- gsub("[[:punct:]]", "", df$`old ratio`)     # remove the % sign
df$`old ratio` <- gsub("\u00A0", "", df$`old ratio`)          # remove non-breaking spaces

# Convert everything to numeric, keyed by country name.
rownames(df) <- df$Country
df <- df[,-1]  # drop the country-name column (kept as rownames)
df[] <- lapply(df, function(x){as.numeric(x)})

# Drop every column that contains any NA.
# FIX: the original built the index vector with a grow-in-loop and then did
# `df[, na_col_list * -1]`, which selects ZERO columns when no column has NAs
# (numeric(0) index). which() + an emptiness guard handles that case.
na_cols <- which(colSums(is.na(df)) > 0)
if (length(na_cols) > 0) {
  df <- df[, -na_cols]
}
df <- df[,-41]           # drop column 41 — positional; verify against the data
df <- df[,c(-16, -32)]   # drop columns 16 and 32 — positional; verify

# Standardize every column (mean 0, sd 1).
df_std <- as.data.frame(scale(df))
head(sapply(df_std, sd))     # standard deviations — all 1 after scaling
##  gini_score       legal   education   old ratio happy score  child cost 
##           1           1           1           1           1           1
head(sapply(df_std, mean))   # means — all ~0 after scaling
##                gini_score                     legal                 education 
## 0.00000000000000035060926 0.00000000000000007914230 0.00000000000000018397426 
##                 old ratio               happy score                child cost 
## 0.00000000000000001807053 0.00000000000000027642254 0.00000000000000004155316
# Variable selection based on VIF.
# NOTE(review): vif() returns one value per PREDICTOR (no entry for `Birth rate`),
# so `vif_value` is one element shorter than ncol(df_std) and the logical index
# below silently recycles — column alignment should be verified. Also the
# original comment said high-multicollinearity variables are DELETED, but this
# code KEEPS the columns where VIF > 5. Confirm the intent before changing.
vif_model <- lm(`Birth rate` ~ ., data = df_std)
vif_value <- vif(vif_model) > 5
df_vif <- df_std[, vif_value]

# Correlation heatmap of the retained variables.
corrplot::corrplot(cor(df_vif))

# TODO: add one more multicollinearity-check plot.

# 다중공선성 확인 그래프 하나 추가

2 국가별 출산율 분석

2.1 다중선형 회귀분석

전진 선택법을 사용하여 AIC가 낮아지는 쪽으로 변수를 선택하는 방법이다.

# Forward stepwise selection: start from the intercept-only model and add
# predictors one at a time while AIC keeps decreasing.
df_regression <- lm(`Birth rate` ~ ., data=df_vif)       # full model (upper bound)
df_regression_null <- lm(`Birth rate` ~ 1, data=df_vif)  # intercept-only (lower bound)

step_df <- step(df_regression_null, scope=list(lower=df_regression_null, upper=df_regression),
                direction="forward")
summary(step_df) # model summary
## 
## Call:
## lm(formula = `Birth rate` ~ `old ratio` + `infant mortality rate` + 
##     `industry employed` + gini_score + `Women suicide`, data = df_vif)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.34898 -0.26033  0.00917  0.24655  1.42549 
## 
## Coefficients:
##                                         Estimate               Std. Error
## (Intercept)             -0.000000000000000009082  0.051711514739097380611
## `old ratio`             -0.491333196552296014303  0.083993501847600113286
## `infant mortality rate`  0.448512855986892722271  0.077955158697127033274
## `industry employed`     -0.144006310782061580644  0.056417257911671542248
## gini_score              -0.126978565252482467063  0.059534202151918375057
## `Women suicide`         -0.084507832070379720890  0.055480500329053147879
##                         t value    Pr(>|t|)    
## (Intercept)               0.000      1.0000    
## `old ratio`              -5.850 0.000000162 ***
## `infant mortality rate`   5.753 0.000000237 ***
## `industry employed`      -2.553      0.0130 *  
## gini_score               -2.133      0.0366 *  
## `Women suicide`          -1.523      0.1324    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4418 on 67 degrees of freedom
## Multiple R-squared:  0.8183, Adjusted R-squared:  0.8048 
## F-statistic: 60.37 on 5 and 67 DF,  p-value: < 0.00000000000000022
round(lm.beta(step_df), 3)    # standardized coefficients
##             `old ratio` `infant mortality rate`     `industry employed` 
##                  -0.491                   0.449                  -0.144 
##              gini_score         `Women suicide` 
##                  -0.127                  -0.085

2.2 검증

vif(step_df) # multicollinearity check on the selected model (all < 5 here)
##             `old ratio` `infant mortality rate`     `industry employed` 
##                2.602115                2.241428                1.173976 
##              gini_score         `Women suicide` 
##                1.307279                1.135314
# Diagnostic plots via sjPlot.
set_theme(axis.title.size = 1.0, axis.textsize = 1.0)
plot_model(step_df, type = "diag", wrap.labels=5)
## [[1]]

## 
## [[2]]
## `geom_smooth()` using formula 'y ~ x'

## 
## [[3]]

## 
## [[4]]
## `geom_smooth()` using formula 'y ~ x'

par(mfrow = c(2,2))# show the four base-R diagnostic plots in one panel
plot(step_df)

par(mfrow = c(1,1))   # restore the single-plot layout

2.3 Elasticnet 분석

glmnet이라는 패키지를 이용하면, 단순히 정규화 뿐만 아니라 모델의 복잡도도 고려하여 회귀 모델을 만들 수 있다. 그래서 이번엔 elastic을 이용해 보겠다.

set.seed(1234) # make the CV fold assignment reproducible
# Elastic net (alpha = 0.5, i.e. halfway between ridge and lasso) with 4-fold
# cross-validation, minimizing MSE.
# NOTE(review): df_vif[,-7] assumes `Birth rate` sits in column 7 — confirm.
df_elastic <- cv.glmnet(as.matrix(df_vif[,-7]),
                        df_vif$`Birth rate`,
                        family = "gaussian", alpha = .5,
                        nfolds = 4, type.measure = "mse")

plot(df_elastic)      # CV curve: locate the optimal lambda

log(df_elastic$lambda.min)     # lambda with the smallest CV MSE
## [1] -1.460013
log(df_elastic$lambda.1se)     # most parsimonious lambda within 1 SE of the minimum
## [1] -0.9018106
plot(df_elastic$glmnet.fit, xvar = "lambda")   # coefficient paths as lambda shrinks

coef.elastic <- coef(df_elastic, s = "lambda.min")[,1] 
coef.elastic       # coefficients at lambda.min (zeros were eliminated by the penalty)
##                 (Intercept)                  gini_score 
## -0.000000000000000007108946  0.000000000000000000000000 
##                   education                   old ratio 
## -0.123567078742224403797856 -0.334688891616295614017673 
##                 happy score                 Men suicide 
##  0.000000000000000000000000  0.000000000000000000000000 
##               Women suicide                         gdp 
##  0.000000000000000000000000  0.000000000000000000000000 
##                surface area             gdp growth rate 
##  0.000000000000000000000000  0.000000000000000000000000 
##              gdp per capita                 agriculture 
##  0.000000000000000000000000  0.000000000000000000000000 
##                    industry services and other activity 
##  0.000000000000000000000000  0.000000000000000000000000 
##        agriculture employed           industry employed 
##  0.000000000000000000000000 -0.057301106232413190344488 
##                  unemployed      agriculture production 
##  0.000000000000000000000000  0.001137891967815737306532 
##             food production                     exports 
##  0.000000000000000000000000  0.000000000000000000000000 
##                     imports               trade balance 
##  0.000000000000000000000000  0.000000000000000000000000 
##                    refugees       infant mortality rate 
##  0.000000000000000000000000  0.301933059415346150000659 
##          threatened species                CO2 emission 
## -0.002939162053789053444253  0.000000000000000000000000 
##           energy production 
##  0.000000000000000000000000
mse.min.elastic <- df_elastic$cvm[df_elastic$lambda == df_elastic$lambda.min]
mse.min.elastic    # best CV MSE of the elastic-net model
## [1] 0.2763587
r2.min.elastic <- df_elastic$glmnet.fit$dev.ratio[df_elastic$lambda == df_elastic$lambda.min]
r2.min.elastic     # R-squared of the elastic-net model at lambda.min
## [1] 0.7699512
# Training MSE of the stepwise model, for comparison (note: training MSE, not CV MSE).
step_df_mse <- sum( (step_df$residuals)^2 )/nrow(df_vif)
step_df_mse
## [1] 0.1791634

2.4 Twitter 분석

# Text-mining / sentiment-analysis packages for the Twitter section.
# KoNLP comes from GitHub (install lines kept for reproducibility).
#install.packages("remotes")
#remotes::install_github("haven-jeon/KoNLP", upgrade = "never", INSTALL_opts = c("--no-multiarch"))
library(syuzhet)
library(tidytext)
library(wordcloud)
## Loading required package: RColorBrewer
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:httr':
## 
##     content
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(stringr)
library(rtweet)
## 
## Attaching package: 'rtweet'
## The following object is masked from 'package:syuzhet':
## 
##     get_tokens
## The following object is masked from 'package:purrr':
## 
##     flatten
library(dplyr)
library(twitteR)
## 
## Attaching package: 'twitteR'
## The following object is masked from 'package:rtweet':
## 
##     lookup_statuses
## The following objects are masked from 'package:dplyr':
## 
##     id, location
library(dplyr)
library(RColorBrewer)
library(KoNLP)
## Checking user defined dictionary!
library(SnowballC)
library(ggplot2)
library(SentimentAnalysis)
## 
## Attaching package: 'SentimentAnalysis'
## The following object is masked from 'package:base':
## 
##     write
library(tidyverse)
# Load the Sejong dictionary for Korean morphological analysis (used by extractNoun).
useSejongDic()
## Backup was just finished!
## 370957 words dictionary was built.
# API credentials were sourced from a local file when harvesting tweets live.
#source(file = "TwitterAPIKey.R", echo = FALSE)

# Tweets about "childbirth" were harvested once with rtweet and cached to CSV;
# the commented-out lines document how the cache was produced.
#childbirth <- enc2utf8("childbirth")
#data <- search_tweets(childbirth, n=10000, lang="en")
#data <- unique(data)
#data <- data[,c(1,2,3,4,5,6,7,67)]
#write_csv(data, "childbirthdata.csv")

data <- read.csv("https://raw.githubusercontent.com/jkworldchampion/Data_analytics/main/final_report/data_set/childbirthdata.csv")

# Strip @mentions first, then remaining punctuation.
# BUG FIX: the second call originally re-read data$text, throwing away the
# @mention removal — usernames then survived as plain words in the corpus
# (visible in the old output, e.g. "twopacsugar pressivx").
text <- str_replace_all(data$text, "@\\w+", "")
text <- str_replace_all(text, "[[:punct:]]", "")

# Build the corpus and apply the standard tm cleaning pipeline.
# ("transformation drops documents" warnings from tm_map are expected.)
wordCorpus <- Corpus(VectorSource(text))
wordCorpus <- tm_map(wordCorpus, content_transformer(tolower))
wordCorpus <- tm_map(wordCorpus, removeWords, stopwords("english"))
wordCorpus <- tm_map(wordCorpus, removeWords, c("https"))
wordCorpus <- tm_map(wordCorpus, stripWhitespace)
wordCorpus <- tm_map(wordCorpus, removePunctuation)
wordCorpus <- tm_map(wordCorpus, removeNumbers)
wordCorpus <- tm_map(wordCorpus, removeWords, c("amp"))
# Remove the query term itself and URL/account artifacts.
wordCorpus <- tm_map(wordCorpus, removeWords, c("childbirth"))
wordCorpus <- tm_map(wordCorpus, removeWords, c("nhs", "realdonaldtrump", "tcoqzrnig"))
wordCorpus[[1]]$content   # spot-check the first cleaned document

# Word cloud of the cleaned corpus.
pal <- brewer.pal(8, "Dark2")
wordcloud(words=wordCorpus, scale=c(4,0.5), max.words = 300,
          random.order = F, rot.per=0.35, use.r.layout = F, min.freq = 50, colors=pal)

# Map a scalar sentiment score to an ordered label:
#   x <= -1      -> "1) very negative"
#   -1 < x < 0   -> "2) negative"
#   x == 0       -> "3) neutral"
#   0 < x < 1    -> "4) positive"
#   x >= 1       -> "5) very positive"
# FIX: the original used vectorized `&` inside scalar `if` conditions and buried
# the neutral case in a trailing `else` that only x == 0 could reach. The
# guards below are ordered and mutually exclusive, with the same mapping.
encodesentiment <- function(x) {
  if (x <= -1) {
    "1) very negative"
  } else if (x < 0) {
    "2) negative"
  } else if (x == 0) {
    "3) neutral"
  } else if (x < 1) {
    "4) positive"
  } else {
    "5) very positive"
  }
}

# Score each tweet with the syuzhet lexicon and bucket into the five labels.
tweetsentiment <- get_sentiment(data$text, method= "syuzhet")
tweets <- cbind(data, tweetsentiment)
tweets$Sentiments <- sapply(tweets$tweetsentiment, encodesentiment)

# Histogram of the raw sentiment scores.
qplot(tweets$tweetsentiment) + theme(legend.position = "none")+
  xlab("sentiment score") + ylab("number of tweets") +
  ggtitle("childbirth tweets by sentiment score")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Bar chart of the five sentiment buckets.
ggplot(tweets, aes(Sentiments)) +
  geom_bar(fill="orange") +
  theme(legend.position = "none", axis.title = element_blank())+
  ylab("number of tweets") +
  ggtitle("tweets about childbirth")

# =============================================

# Tweets about "parenting", harvested once and cached to CSV (same workflow as
# the childbirth corpus above).
#parenting <- enc2utf8("parenting")
#data2 <- search_tweets(parenting, n=10000, lang="en")
#data2 <- unique(data2)
#data2 <- data2[,c(1,2,3,4,5,6,7,67)]
#write_csv(data2, "parentingdata.csv")

data2 <- read.csv("https://raw.githubusercontent.com/jkworldchampion/Data_analytics/main/final_report/data_set/parentingdata.csv")

# Strip @mentions first, then remaining punctuation.
# BUG FIX: the second call originally re-read data2$text, discarding the
# @mention removal, so usernames survived as plain words in the corpus.
text2 <- str_replace_all(data2$text, "@\\w+", "")
text2 <- str_replace_all(text2, "[[:punct:]]", "")

# Build the corpus and apply the standard tm cleaning pipeline.
# ("transformation drops documents" warnings from tm_map are expected.)
wordCorpus2 <- Corpus(VectorSource(text2))
wordCorpus2 <- tm_map(wordCorpus2, content_transformer(tolower))
wordCorpus2 <- tm_map(wordCorpus2, removeWords, stopwords("english"))
wordCorpus2 <- tm_map(wordCorpus2, removeWords, c("https"))
wordCorpus2 <- tm_map(wordCorpus2, stripWhitespace)
wordCorpus2 <- tm_map(wordCorpus2, removePunctuation)
wordCorpus2 <- tm_map(wordCorpus2, removeNumbers)
# Remove the query term itself and URL/account artifacts.
wordCorpus2 <- tm_map(wordCorpus2, removeWords, c("parenting"))
wordCorpus2 <- tm_map(wordCorpus2, removeWords, c("tcolukpux", "yoongi"))
wordCorpus2 <- tm_map(wordCorpus2, removeWords, c("httpstcolukpux"))
wordCorpus2 <- tm_map(wordCorpus2, removeWords, c("amp"))
wordCorpus2[[1]]$content   # spot-check the first cleaned document

# Word cloud of the cleaned corpus (seed fixed for a reproducible layout).
set.seed(1234)
wordcloud(words=wordCorpus2, scale=c(4,0.5), max.words = 200,
          random.order = F, rot.per=0.35, use.r.layout = F, min.freq = 10, colors=pal)

# Sentiment analysis for the parenting corpus.

tweetsentiment2 <- get_sentiment(data2$text, method= "syuzhet")
tweets2 <- cbind(data2, tweetsentiment2)
tweets2$Sentiments2 <- sapply(tweets2$tweetsentiment2, encodesentiment)

# Histogram of the raw sentiment scores.
qplot(tweets2$tweetsentiment2) + theme(legend.position = "none")+
  xlab("sentiment score") + ylab("number of tweets") +
  ggtitle("parenting tweets by sentiment score")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Bar chart of the five sentiment buckets.
ggplot(tweets2, aes(Sentiments2)) +
  geom_bar(fill="aquamarine4") +
  theme(legend.position = "none", axis.title = element_blank())+
  ylab("number of tweets") +
  ggtitle("tweets about parenting")

# =====================================================


# Korean-language tweets about childbirth ("출산"), harvested once and cached.
#childbirth_k <- enc2utf8("출산")
#Kdata <- search_tweets(childbirth_k, n=10000, lang="ko")
#Kdata <- unique(Kdata)
#Kdata <- Kdata[,1:13]
#write_csv(Kdata, "출산.csv")
Kdata <- read.csv("https://raw.githubusercontent.com/jkworldchampion/Data_analytics/main/final_report/data_set/%E1%84%8E%E1%85%AE%E1%86%AF%E1%84%89%E1%85%A1%E1%86%AB.csv")

Kdata_text <- Kdata$text
# Extract nouns from each tweet with KoNLP's morphological analyzer; tweets the
# tokenizer cannot handle (emoji, etc.) emit "can't processing" warnings.
Kdata_text <- sapply(Kdata_text, extractNoun, USE.NAMES = F)
## Warning in value[[3L]](cond): can't processing '하... 너무슬퍼
## 콧김콧물눈물출산동시분출 💦'.
## Warning in value[[3L]](cond): can't processing '@funfun_haja 엄마되심에 축하드려요🥳🥳🥳🥳
## 무사 출산?? 을 빕니다😊😊😊'.

## Warning in value[[3L]](cond): can't processing '@funfun_haja 엄마되심에 축하드려요🥳🥳🥳🥳
## 무사 출산?? 을 빕니다😊😊😊'.
## Warning in value[[3L]](cond): can't processing '@jjin6n7 저는 ㄷㄹㅁ 출산 까
## 지 ... 진짜 망상녀라고 생각했는데 다들 같은 생각 이셨어 😊'.
## Warning in value[[3L]](cond): can't processing '@noll091912 그러게요 임신 출산
## 유산 생리……. 이건또 문제가 되나봐요🥲'.
## Warning in value[[3L]](cond): can't processing '@stuckyi_garden 스투키님 축하드려요🥳🥳🥳🥳🥳
## 엄마가 되는 과정이 참 쉽진 않지만
## 항상 건강 잘 챙기시고 무사히 출산하길 바랄게요🙏🙏  축하드려요💞💞'.
## Warning in value[[3L]](cond): can't processing '@heekyswi 저도 출산하고 요실금
## 때문에 몇번 실수 했어요 ㅠㅠ 흑흑 시간 지나면 괜찮아질거라고 생각하고싶었는데 😭
## 😭'.
## Warning in value[[3L]](cond): can't processing '난 몰라ㅠ 화장실에서 내편 이발해주다가 방귀가 뽀롱! 나와버렸다😭 출산 후 아직 괄약근에 힘이 잘 안 들어가서 그만 실수를.. 순간 "어머!"하고 얼굴이 새빨개졌더나 방귀 꼈냐고ㅜ 고개 끄덕끄덕...
## 나 배려해서 아기 따라하는 거냐고 괜찮다며 안 웃고 태연하게 넘어가주는데 너무 창피해....😭'.
## Warning in value[[3L]](cond): can't processing '〈이승만 정권의 해외입양은 '혼혈아 청소'였다〉라는 《프레시안》 기사 읽다가 역해졌다. 너무 성차별적이고, 여성혐오적이네.
## 
## 한국남자들은 한국 여자가 한국남자 없이 낳은 후세를 견디지 못한다. 그러니 2021년에도 정자은행을 등을 이용한 비혼 출산에 "반대한다"며 난리 치는 거겠지. 🤮'.
## Warning in value[[3L]](cond): can't processing '내친김에 말하면 4050 아들가진 엄
## 마들 중에서 사유리에 대해 치를떠는 사람들 정말 많음 ㅋㅋ 어떻게 여자가 남편도 얻
## 지 않고 애를 가질 수 있냐면서 그런 사람이 공영방송에 나와서 비혼출산을 장려하면
## 6가족질서가 붕괴9될 거라면서 당장 퇴출시켜야 한다고 열변하던 거 생각나면 비웃겨
## 죽겠음🤪'.
## Warning in value[[3L]](cond): can't processing '@trysmilehahaha @prometh77
## @lumimuse 기본적으로 인스타가 뭐든 좋게 포장해서 보여주려는 경향도 있는데 거기에
## 방송인이니 더 그런 것도 있겠죠. 그럼에도 비슷한 시기에 출산한 다른 방송인과 비교
## 해서 그 분은 좀 너무 과하다 싶을 정도더라구요 ㅋㅋㅋㅋㅋ 이제 백일 지났던데.. 돌
## 까지 많이 즐기셨으면.. 😌'.
## Warning in value[[3L]](cond): can't processing '@jjimny13 튼살크림 영양제 좋은데
## 요ㅎㅎ 저도 아이엄마지만 출산 때 엄마 선물은 한번도 못 받아봤어요ㅠ ㅠ 엄마 선물
## 이 젤 최곤데 보통 아이 선물 만 해주죠😭😭'.
## Warning in value[[3L]](cond): can't processing '친한 친구가 곧 임신 8개월차로 접어드는데
## 출산 선물로 어떤게 좋을까요...!?  아들이래요😊  경험자님들의 조언 부탁드립니당💛'.
## Warning in value[[3L]](cond): can't processing '아 근데 나 백신 부작용인가. 생리
## 끝난지 이주일 쯤 됐는데 부정출혈이 있고 얼굴에 모기 물린 자국이 두어군데 생김.
## 출산한지 얼마 안되서 생리 두번하나 보다 했는데 너무 둔했다. 병원가기 완전 귀찮은
## 데 😇'.
## Warning in value[[3L]](cond): can't processing '@trustop_nn 그❗️쵸❗️이현이 이련
## 이가 임신➡️출산 했어도 이련이밖에 모르고 이련이도 그거 알고 있고 그런 게 넘 조아
## 요 엉엉 평생 행복해🥺'.
## Warning in value[[3L]](cond): can't processing '@eeboau 으아 보리님도 ㅠㅠㅠㅠㅠ
## 전 출산 직후 심하다가 괜찮아졌었는데 최근 다시 심해졌어요 ㅠㅠ 이번주에만 두번이
## 나… 하 정말 미치겠네요 그쵸…. 이게 뭔일이야 정말 😞'.
## Warning in value[[3L]](cond): can't processing '@rynnaSY 대박 저도 돌고돌아 저도
## 흙침대예요ㅋㅋㅋ 요통인은 요통인의 심정을 알죵~ 저도 디스크있어서 후후. 임신 중
## 이라 생으로 견디는데 출산하면 병원 문지방 닳게 다녀야겠네요😅'.
## Warning in value[[3L]](cond): can't processing '@malimaliboo 읽다만 출산기 읽고
## 👀 캔디윰윰 읽으려구요!👀🍬'.
## Warning in value[[3L]](cond): can't processing '@robotaev ㅊㅁ)안녕하세요 로보
## 트태권브이님 가뜩이나 정신없는데 이상한 분까지 만나서 고생이 많으시네요ㅠㅠ 저도
## 최근에 출산했어가지구 그냥 지나칠수가 없어 댓 남기고 가요 다음 분은 꼭 좋은분이
## 시길..🙏'.
## Warning in value[[3L]](cond): can't processing '@flashbeauty0525 ?? 아 그런일도 있네요 .ㅠ 
## 출산후 와주신분은 첫째 영양 부족한거 같다며 요구르트 손수 만들어 오시고 시장봐오시고 돈도 안받으시고 음식해주셨어요 정말 감사, 감동 .. 
## 네 그래야겠어요🙏  
## 근데 바로 옆집사세요..ㅋㅋㅋ 이웃주민😅'.
## Warning in value[[3L]](cond): can't processing '@songonyusuke @cx_r @yujuiiiii
## @SamYu_Honey 출산하고 오시기 바랍니다???????ㅋㅋㅋ🤪😝'.
## Warning in value[[3L]](cond): can't processing '@min__hoy 출산 직전까지 가진통
## 많이 느끼시겠네요ㅋㅋㅋㅋㅋ 제일 예쁜 모습일때 👍👍🤣'.
## Warning in value[[3L]](cond): can't processing '주변에 임신 출산이 갑자기 빈번해
## 짐. 다들 축하축하🎉🎉'.
## Warning in value[[3L]](cond): can't processing '@MozoriMoo @edzROuS5mVF0B33 출산 가산점이라!! 
## 가산점 산정 방법, 사용처 등 고민할 것이 많지만 
## 출산률 높일 수 있는 좋은 방법이라고 생각합니다. 👍
## 덤으로 꼴페미들도 속아낼 수 있으니 도랑치고 가재잡는 격이네요.😆'.
## Warning in value[[3L]](cond): can't processing '@minwoon 출산 가산점을 만들어서
## 일반 여성과 페미 매국노들을 대립 시켜야죠 😂🤣'.
## Warning in value[[3L]](cond): can't processing '@MozoriMoo @edzROuS5mVF0B33 출산 가산점이라!! 
## 가산점 산정 방법, 사용처 등 고민할 것이 많지만 
## 출산률 높일 수 있는 좋은 방법이라고 생각합니다. 👍
## 덤으로 꼴페미들도 속아낼 수 있으니 도랑치고 가재잡는 격이네요.😆'.
## Warning in value[[3L]](cond): can't processing '@minwoon 출산 가산점을 만들어서
## 일반 여성과 페미 매국노들을 대립 시켜야죠 😂🤣'.
## Warning in value[[3L]](cond): can't processing '친구가 곧 넷째를 출산한다고… 그
## 저 리스펙!🙌'.
## Warning in value[[3L]](cond): can't processing '@Amethyst_btstwt 아메님 33시간이
## 라뇨…고생하셨어요 ㅠㅠㅠㅠㅠ진짜 출산후기보면..너무 살떨려요ㅠㅠ 너무 대단하신분
## 들. 아가의 다섯번째 생일을 진심으로 축하드려요!! 아메님도 아가도 건강하셔야해요
## 💛'.
## Warning in value[[3L]](cond): can't processing '@Amethyst_btstwt 끝이다! 라니 새
## 로운 출산 후기네요ㅎ 과거의 아메 님 고생하셨습니다! 아이는 생일 축하합니다~ 💜'.
## Warning in value[[3L]](cond): can't processing '@steve55_grum 그불샘 축하드려요!!!🥳🎉🎊
## 몸조리 잘하시구 아무쪼록 그불샘도 아이도 건강한 출산이 되시길요🙏
## 그불샘이 행복해야 아이도 행복🤍💚그러니 태교는 브윈☀🐰'.
## Warning in value[[3L]](cond): can't processing '@bjd_dj 아고 검날님~출산준비만큼
## 설레고 행복한 순간이 없지요😊😊출산준비 잘 마무리 되시길 바랍니다😍'.
## Warning in value[[3L]](cond): can't processing '@steve55_grum 오마갓 너무 축하드
## 려요💚🤍 우리애들처럼 착하고 잘생기고 이쁜 아가출산하세요👏👏👏 언제든지 몸건강
## 하게 돌아오세용💚🤍💚🤍'.
## Warning in value[[3L]](cond): can't processing '@bjd_dj 검날님! 건강히 출산하시
## 길 바래요! 이것저것 바쁘시겠네요 한참 ☺ 항상 행복만 가득하시기를!!💗💕'.
## Warning in value[[3L]](cond): can't processing '@bjd_dj 몸도 맘도 건강히 출산하
## 시기 바랍니다..!! 검날님 화이팅!!😚😤'.
## Warning in value[[3L]](cond): can't processing '저...아기 출산한거 저 아니고 사
## 유리 언니에요. 축하문자 가끔 받는데 웃픔🤭 사유리언니 축하해요😍'.
## Warning in value[[3L]](cond): can't processing '여러분 혹시 아가들이 쓸 수 있는
## 무지 손수건은 어디서 구입할 수 있을까요? 곧 출산을 할 친구에게 수를 놓아 선물하
## 고 싶어요. 😌🐥🍼👼'.
## Warning in value[[3L]](cond): can't processing '@bjd_dj 막달 건강하고 안전하게
## 보내세요 ! 💖💖 아기 출산도 순산하시길 바랍니다🥺💖🥺💖🥺💖 저도 이제 생후 1년
## 다되가는 초보엄마라 😭 예비초보엄마 화이팅이에요 💪💪💪'.
## Warning in value[[3L]](cond): can't processing '@perhaps_bjd 같이 불출산에 올라
## 보아요😉❤️ 레몬 귀여운데 하나 하실래요??🥰'.
## Warning in value[[3L]](cond): can't processing '@NS_JUNG 오늘 본 다큐가 탄생에
## 관한 것이라 임신에 대한 부분이 많네요. 남자로 성전환한 트랜스젠더가 아기를 출산
## 하는 장면이 나왔어요. 수염이 시커면 남자가 제왕절개로 아이를 낳았어요. 너무 놀라
## 운 세상이예요. 🙀'.
## Warning in value[[3L]](cond): can't processing '@btssuga123 달달님 출산하셔써염? 😳 
## 아가야 두시간만 푹 자주라..😅'.
# Flatten the per-tweet noun lists into one token vector and keep only
# tokens of at least two characters (single Hangul syllables are rarely
# meaningful words).
Kdata_word <- unlist(Kdata_text)
Kdata_word <- Filter(function(x) nchar(x) >= 2, Kdata_word)

# Strip noise: line breaks, URL prefixes, mention markers, punctuation,
# Latin letters/digits, and stray Hangul jamo (consonant fillers and
# the ㅜ/ㅠ crying emoticons).
Kdata_word <- gsub("\n", "", Kdata_word)
Kdata_word <- gsub("\r", "", Kdata_word)
Kdata_word <- gsub("https://", "", Kdata_word)
Kdata_word <- gsub("@", "", Kdata_word)
Kdata_word <- gsub("[[:punct:]]", "", Kdata_word)
Kdata_word <- str_replace_all(Kdata_word, "[A-Za-z0-9]", "")
Kdata_word <- gsub("[ㄱ-ㅎ]", "", Kdata_word)
# "[ㅜㅠ]" fixes the original "[ㅜ|ㅠ]": inside a character class "|"
# is a literal pipe, not alternation (the stray pipe was harmless here
# only because punctuation was already removed above).
Kdata_word <- gsub("[ㅜㅠ]", "", Kdata_word)

# Remove the query term itself plus common particles/fillers.
# Applied sequentially, preserving the original order, because order
# matters: removing "해서" first turns "그래서" into "그래" before the
# "그래서" pattern is ever tried. fixed = TRUE since these are literal
# strings, not regexes.
stopwords_kr <- c("로", "합사시켜", "해서", "출산", "쿠오모가",
                  "까지", "들이", "그래서", "그걸")
for (w in stopwords_kr) {
  Kdata_word <- gsub(w, "", Kdata_word, fixed = TRUE)
}

# Cleaning can shrink tokens below two characters (or to ""), which
# would otherwise inject junk into the word-cloud frequency counts;
# re-apply the length filter after all substitutions.
Kdata_word <- Kdata_word[nchar(Kdata_word) >= 2]

head(Kdata_word)

# Word cloud of the Korean childbirth tokens (AppleGothic supplies the
# Hangul glyphs; seeded for a reproducible layout).
set.seed(1234)
wordcloud(words = Kdata_word, scale = c(5, 0.5),
          min.freq = 30, random.order = FALSE, max.words = 200,
          family = "AppleGothic", colors = pal)
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 응
## 급상황 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 비
## 행기 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 바
## 이크 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 소
## 유자 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 활
## 주 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 페
## 루 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 때
## 문 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 인
## 간 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 육
## 아 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 품
## 종 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 착
## 취 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 사
## 회 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 선
## 택 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 직
## 원 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 가
## 족 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 최
## 초 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 미
## 안 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 원
## 전 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 주
## 택용 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 마
## 리 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 보
## 호 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 작
## 업 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 인
## 공 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 진
## 보진영 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 물
## 리적 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 호
## 르몬 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 암
## 수 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 이
## 식임신 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 자
## 신 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 당
## 연 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 의
## 원 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 강
## 요 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 이
## 름 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 결
## 합 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 생
## 각 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 성
## 소수자 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 권
## 리 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 기
## 초 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 도
## 덕 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 국
## 회의원 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 하
## 게 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 정
## 치 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 기
## 부 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 네
## 이버 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 이
## 상 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 자
## 기 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 새
## 끼 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 자
## 유 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 반
## 대 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 계
## 층 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 중
## 상 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 문
## 제 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 용
## 혜인 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 엄
## 마 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 청
## 년 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 행
## 태 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 진
## 행 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 사
## 고 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 궁
## 핍 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 임
## 신괴 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 정
## 책 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 방
## 패 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 여
## 자 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 저
## 소득층 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 전
## 기요금 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 공
## 제 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 할
## 인제도 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 개
## 선 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 다
## 자녀가구 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 취
## 약 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 할
## 인 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 유
## 지확대합니다 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 산
## 업통상자원부 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 고
## 양이 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 다
## 양 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 유
## 전 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 마
## 지막 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 파
## 일 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 제
## 작 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 자
## 체 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 퍼
## 센트 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 심
## 장 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 근
## 친 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 상
## 상 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 무
## 엇 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 정
## 치인 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 소
## 액 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 번
## 식 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 실
## 수 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 병
## 행 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 마
## 다 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 정
## 파 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 선
## 행 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 판
## 단기준 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 흥
## 미 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 프
## 젝트 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 삭
## 제 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 계
## 속 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 픽
## 사가 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 토
## 이스토리 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 네
## 트워크 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 드
## 라이버 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 원
## 본 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 명
## 령어 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 입
## 력 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 정
## 도 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 국
## 회 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 상
## 태 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 누
## 구 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 혐
## 오 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 상
## 업적 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 가
## 부장적 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 공
## 간 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 행
## 위 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 형
## 태 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 이
## 해 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 도
## 움 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 반
## 사회적 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 발
## 견 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 영
## 상 could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = Kdata_word, scale = c(5, 0.5), min.freq = 30, : 동
## 성애자 could not be fit on page. It will not be plotted.

# Sentiment analysis (Korean) -----------------------------------------

# KNU Korean sentiment lexicon: tab-separated term + score
# (< 0 negative, > 0 positive). The dictionary's last line is
# malformed, hence the single parsing failure readr reports.
senti_words_kr <- readr::read_delim("https://raw.githubusercontent.com/park1200656/KnuSentiLex/master/SentiWord_Dict.txt", delim='\t', col_names=c("term", "score"))
head(senti_words_kr)

# Drop duplicated terms, then build a binary positive/negative
# dictionary for SentimentAnalysis. (The original also built a
# weighted dictionary first and immediately overwrote it with this
# one; that dead SentimentDictionaryWeighted() call is removed.)
x <- duplicated(senti_words_kr$term)
senti_words_kr2 <- senti_words_kr[!x, ]
senti_dic_kr <- SentimentDictionary(senti_words_kr2$term[senti_words_kr2$score > 0], 
                                    senti_words_kr2$term[senti_words_kr2$score < 0])

# Score each cleaned token against the Korean dictionary.
res_sentiment <- analyzeSentiment(Kdata_word,
                                  language="korean",
                                  rules=list("KoreanSentiment"=list(ruleSentiment, senti_dic_kr)),
                                  removeStopwords = FALSE, stemming = FALSE)
theme_set(theme_minimal(base_family = "AppleGothic"))

df <- data.frame(res_sentiment, Kdata_word)
df <- df[!(df$Kdata_word==""),]

# Classify by the SIGN of the numeric score before any character
# coercion. The original assigned "neutral" first (coercing the whole
# column to character), then compared against a few hard-coded positive
# scores (1, 0.125, 0.1111…, 0.0833…) — silently missing every other
# positive value — and "< 0" became a lexicographic string comparison.
score <- df$KoreanSentiment
df$KoreanSentiment <- ifelse(is.na(score) | score == 0, "neutral",
                             ifelse(score > 0, "positive", "negative"))

# Neutral dominates overwhelmingly, so drop it before plotting.
df_final <- df[!(df$KoreanSentiment=="neutral"),]

ggplot(df_final, aes(x = KoreanSentiment)) + 
  geom_bar(stat = "count", width = 0.7, fill = "orange") + 
  theme_minimal() + ggtitle("tweets about childbirth in korea")

# ===================================================

# Korean tweets containing "육아" (child-rearing).
# One-off Twitter harvest cached to CSV; the collection code is kept
# commented out to document the cached file's provenance.
#childcare_k <- enc2utf8("육아")
#Kdata2 <- search_tweets(childcare_k, n=10000, lang="ko")
#Kdata2 <- unique(Kdata2)
#Kdata2 <- Kdata2[,1:13]
#write_csv(Kdata2, "육아.csv")

Kdata2 <- read.csv("https://raw.githubusercontent.com/jkworldchampion/Data_analytics/main/final_report/data_set/%E1%84%8B%E1%85%B2%E1%86%A8%E1%84%8B%E1%85%A1.csv")

# Per-tweet noun extraction with KoNLP; ragged output, so the result
# stays an unnamed list. Unparseable tweets emit "can't processing"
# warnings.
Kdata_text2 <- sapply(Kdata2$text, extractNoun, USE.NAMES = FALSE)
## Warning in value[[3L]](cond): can't processing '전에 트위터에서 어떤 분이 로판
## 육아물 보고 한국 여자들의 욕망이 다른 것도 아니고 어릴 때부터 가족에게 사랑 받는
## 것이란 게 너무 슬프다고 하신게 잊혀지지 않음 🤧'.
## Warning in value[[3L]](cond): can't processing '오늘은 Father’s Day인데 어휴 
## 솔직히 내 남편 같은 아빠들에겐 이런 특별한 날이 뭐가 필요있나 싶네 너는 맨날 파더스데이잖아 
## 애 보라 하면 하고 싶은 거 다 하면서 애 보는데 육아는 모름지기 내가 하고 싶은 걸 할 수 없음이거늘..^^^ 시댁 전화 좀 하랬더니 세월아 네월아.. 알아서 해라😐'.
## Warning in value[[3L]](cond): can't processing '애개육아는 정말 너무너무 어려워요
## 한손에 한손으로 카트 끌고 한손으로 또또 잡고 걷다 도랑으로 바퀴가 빠져 애기를 냅다 쏟아 얼굴로 화단에 그대로 자빠지게 만든 나쁜엄마😓'.
## Warning in value[[3L]](cond): can't processing '육아물이랑 임신수.... 모멘트 따
## 로 만들까🤔'.
## Warning in value[[3L]](cond): can't processing '내가 육아물에 환장하는 거 어캐 알고.
## 탐라에 육아물이 가득 🤤🤤🤤'.
## Warning in value[[3L]](cond): can't processing '@Let_me__ 아하하하하ㅏ하하하핳ㅎ
## 🤣🤣🤣🤣🤣🤣 아 렛미님네 아가 명언제조기같아요ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ아버
## 지 드디어 핵매운맛육아를 맛보셨군요… 저는 (이러면 안되지만) 괜히 같이 고소하구요
## 😏ㅋㅋㅋㅋㅋㅋㅋㅋ 그래도 마지막 마무리 가족 다같이 깔깔 웃으며 끝나니 저까지 넘
## 행복해집니다🥰'.
## Warning in value[[3L]](cond): can't processing '전에 트위터에서 어떤 분이 로판
## 육아물 보고 한국 여자들의 욕망이 다른 것도 아니고 어릴 때부터 가족에게 사랑 받는
## 것이란 게 너무 슬프다고 하신게 잊혀지지 않음 🤧'.
## Warning in value[[3L]](cond): can't processing '모랄까 육아 첨 시작할때 아기한테
## 항상 웃고 리액션 해주고 좋은 말만 해주고 그런게 너무 스트레스였는데 감정노동 3년
## 이 넘어 판데믹에 가정보육으로 잘때 빼고 내 얼굴, 내 표정, 내 기분 되찾기가 불가
## 능 해지니 이게 쌓이고 쌓여 결국 터진 것 같다. 근데 해결법도 없는 일 아닌지🤔'.
## Warning in value[[3L]](cond): can't processing '살인자의 기억법
## 이제 보는 중인데
## 소설 바탕으로 만든 영화라는 게 정말 느껴짐
## 살인이 시라면 육아는 산문이다&lt;이게 제일 인상 깊었던 
## 중간중간 독백도 소름 돋고 웃김 
## 욕이 대부분이라 적나라하긴 하지만 
## 배우가 적절하게 잘 살린 기분
## 책으로도 한 번 읽어봐야지🥰'.
## Warning in value[[3L]](cond): can't processing '@tianshi_k 진짜 육아방식에 차이
## 있어서 더 실감나요ㅋㅋㅋㅋㅋㅋㅋ꽤나 하나하나 가르치고 싶어하는 쿥과 애 하고싶은
## 대로 우선 놔둬봐 하는 탡...😌😌😌'.
## Warning in value[[3L]](cond): can't processing '아 ㅜㅜㅜㅜ 저분 진짜 아카자 육
## 아하시는듯 ㅜㅜㅜㅜㅜㅜㅜ🥺🥺🥺🥺'.
## Warning in value[[3L]](cond): can't processing 'https://t.co/SglVvKbgtB 
## 정민이 육아하는 둘입니다😎'.
## Warning in value[[3L]](cond): can't processing '@somuchforea 육아를 겸한 힐링 아
## 닌 힐링..🙃'.
## Warning in value[[3L]](cond): can't processing '58. 무라사키 넘 좋다 ㅋㅋㅋㅋㅋㅋㅋㅋ
## 애들을 별로 안좋아하는 것 치고는 참된 교육자인데? 
## ...허윤무라 응원합니다...... 육아난이도 최상레벨 헬 육아네......😱💦'.
## Warning in value[[3L]](cond): can't processing '@wood_stock55 신혼생활 즌2 다음
## 엔 육아일기 즌3까지 내줘요오오오옹😆😆'.
## Warning in value[[3L]](cond): can't processing '@wood_stock55 당연 신혼생활 다음
## 이 육아일기가 진리 아뉘게쪄요오오옹😏😏😏😏😏😏'.
## Warning in value[[3L]](cond): can't processing '요즘 육아 브이로그를 너무 많이
## 봤더니 😅'.
## Warning in value[[3L]](cond): can't processing '아기 간병하느라 잠못자고
## 더운데 에어컨 없이 버텨서 그런가
## 점심때 시킨 이삭토스트 반쪽먹고
## 냉장고에 넣어뒀는데
## 저녁을 남은 반쪽으로 해결
## 와 애둘 독박육아하며
## 하루종일 토스트 하나로 버티다니
## 셀프 서프라이즈😯'.
## Warning in value[[3L]](cond): can't processing '@flowersfordawn 좋아보였다니 더
## 욕심 나네요 ㅎㅎㅎ 육아하는 집에 빈 공간이 남아날리가요 😂😂'.
## Warning in value[[3L]](cond): can't processing '투자보다 육아가 더 힘들다..
## 
## 투자로 강해진멘탈 육아로 깨지는중🙃'.
## Warning in value[[3L]](cond): can't processing '@nanabicos 아니 아가들이 진짜 천
## 사보다 더 천사같이 아름다워요ㅠㅠ 분홍색 유모차에 있는 아가는 진짜 너무 귀엽고
## 옆에 있는 아가는 사랑스러워요ㅠㅠㅠ 어떻게 이런 아가들이 태어났는지ㅠㅠ 너무 행
## 복해요ㅠ 육아는 무지무지 힘들다고 들었어요! 나나님은 정말 대단하신 것 같구요ㅠㅠ
## ㅠ🥰🥰🥰'.
## Warning in value[[3L]](cond): can't processing '@egan_shin 황숙… 육아에 밤낮이
## 없겠죠… 😭 황숙… 얼렁 돌아오셔서 옛날에 나꼼수 쫓아댕기던거 회개하시는 튓 올려주
## 셔야하는데… 😩😏😏'.
## Warning in value[[3L]](cond): can't processing '@eunZY5124_YUsk 이제 육아하러 가
## 야해서 술은....😭'.
## Warning in value[[3L]](cond): can't processing '@chobeige 육아와 살림을 남편분이
## 엄청 잘 도와주시겠거니 짐작을 해 봅니다... 그리고 일단 젊으니까!! ㅎㅎ 전 두어시
## 간 뜨개 하면 누워야 해요 🥲'.
## Warning in value[[3L]](cond): can't processing '@akzkfh0 에고ㅠㅠ 육아 정말 너무
## 위대하고 대단한것 같아요ㅠㅠㅠㅠ 애기 얼른 맘마먹고 보채지말고 푹 잤으면 좋겠어
## 요. 부토님도 같이 푹 주무시구요☺️🙏🏻'.
## Warning in value[[3L]](cond): can't processing '가짜육아도 조음…. 남의 애… 식물…
## 강강쥬 고앵쓰 머든…🥲'.
## Warning in value[[3L]](cond): can't processing '지민이는 그냥 뭐랄까 보듬어주고싶고 사랑해주고싶음.. 이런게 유사육아인가??ㅋㅋㅋㅋㅋ 농담이지만 가끔 버블보다보면 그런느낌 들때도 있긴함..ㅎ 아직 아이같은면도 있고.. 근데 또 언니고 리더라고 먼저 나설때는 또 애틋하달까..? 쓰다보니 진짜 육아같네🤔
## 아무튼 얘가 행복했으면 좋겠음🥺🙏'.
## Warning in value[[3L]](cond): can't processing '@yeppue_009s ㅋㅋㅋ오히려 그런
## 육아 방식은 렌디아가(엄마한테 배웠다는)가 고집할 것 같아요! 부드러운 육아방식은
## 스모커가 이끌고😉'.
## Warning in value[[3L]](cond): can't processing '생각해보니까 렌디아는 수룡들의
## 육아방식을 고집하는데 스모커는 인간 육아방식을 주장해서 싸울 것 같기도 하네🤔'.
## Warning in value[[3L]](cond): can't processing '다들 렌디아랑 스모커 우당탕 육아
## 일기를 좋아하시는 군요..나중에 타래로 풀어봐야지🤔'.
## Warning in value[[3L]](cond): can't processing '@Mango_RDW ㅋㅋㅋ
## 엄격한 아빠랑 자상한 엄마가 아니고
## 엄격한 엄마랑 자상한 아빠 육아군요😁'.
## Warning in value[[3L]](cond): can't processing '@egan_shin 황숙… 육아에 밤낮이
## 없겠죠… 😭 황숙… 얼렁 돌아오셔서 옛날에 나꼼수 쫓아댕기던거 회개하시는 튓 올려주
## 셔야하는데… 😩😏😏'.
## Warning in value[[3L]](cond): can't processing '진짜 근데 지민정 룸메되는 날엔
## 정말 육아하게 될까봐....🤔'.
## Warning in value[[3L]](cond): can't processing '돈많으면 육아도 꿀잼인데 
## 오늘 꼬맹이와 둘이서 하루를 불태우며 쓴돈이...10만원돈이라네 우후후....
## 다음주도 열심히 일해야긋지 ...🤑'.
## Warning in value[[3L]](cond): can't processing '@fishtuna_01 필리핀가서 육아 좀
## 하다가 잘생긴 남자 만나서 찐한 연애를 한다면...🥲'.
## Warning in value[[3L]](cond): can't processing '@MICA_dixs 아닛ㅋㅋㅋㅋㅋ
## 육아 힘들지ㅋㅋㅋㅋ
## 그래도 그 댓가로 미카 예쁜모습 보니까 넘 좋다💙🖤'.
## Warning in value[[3L]](cond): can't processing '@MELO_HOLIC_ 냥이 육아일기 그르
## 네요~😁👍'.
## Warning in value[[3L]](cond): can't processing 'https://t.co/SglVvKbgtB 
## 정민이 육아하는 둘입니다😎'.

## Warning in value[[3L]](cond): can't processing 'https://t.co/SglVvKbgtB 
## 정민이 육아하는 둘입니다😎'.
## Warning in value[[3L]](cond): can't processing '@mminjji1 죠는 육아ㅠㅠ 흑흑 우
## 리 현생.. 아기야 얼른커서 같이 덕질하자🥲'.
## Warning in value[[3L]](cond): can't processing '주말에 푹 쉬고 
## 웡낫 육아물 짭근친물 호빠물
## 윤낫 센가물
## 
## 네... 벌려놓은 거 열심히 봉합해볼게요😋'.
## Warning in value[[3L]](cond): can't processing '@2zlet2 안녕하세요 이졸렛님! 항
## 상 따뜻한 말씀 감사합니다 🥲💕 저도 매번 이졸렛님 글 재탕하면서 신작 나오는 거
## 기다리고 있어요! 육아로 바쁘시겠지만 푹 쉬시고, 같이 오래오래 글 써주세요! 🥰'.
## Warning in value[[3L]](cond): can't processing '@manyeo_niel 쪼끄매ㅠㅠㅠㅠㅠ장
## 난감이랑 머리 크기랑 똑같아ㅠㅠㅠㅠㅠㅠ육아의 시름이 다 가시시겠어요ㅠㅠ보여주셔
## 서 고맙습니다💜'.
## Warning in value[[3L]](cond): can't processing '@manyeo_niel 크흡 그렇게 재는군요!!
## 핸드폰만한 춘식이ㅠㅠㅠ마침 제 손안에 폰이 있는데 요따만하다 이거져 으헤헤 춘시가 😍😍
## 마넬님 육아 고생많으시구요 파이팅이구요 춘식이 보여주셔서 감사합니다 자쥬 보여주십시오ㅠㅠ💜헷헤💜'.
## Warning in value[[3L]](cond): can't processing '육아로 바쁜고 아니냐묘🙄'.
## Warning in value[[3L]](cond): can't processing '육아au😆😆😆'.
## Warning in value[[3L]](cond): can't processing '@MOONbee_bw 전 🤣🤣 예전에 육아
## 일기 식으로 쭉 정리할려고 사진 저장은 해놨는데 친구들 다 가려야 해서 포기 했어요
## 그냥 쭉 페북 보면서 혼자 육아 일기를 🤣🤣'.
## Warning in value[[3L]](cond): can't processing '드러머 동원이 모아서 편집하려고 외장하드 열었더니 대박! 13살파이리애깅이부터 보니 진짜 엄청많이 큰게 느껴지네
## 성장하는 아티스트덕질 진짜오랜만인데 696일째 동원이덕에 행복해 이맛에 육아덕질 하지만~😄'.
## Warning in value[[3L]](cond): can't processing '@MariaKimLy3 @BTS_twt 고마워요! 😊
## 울 마리아님도 육아에 힘드실텐데...아자 아자 화이팅입니다. 더위에 지치지 않게 조심하시구요. 💪💜'.
## Warning in value[[3L]](cond): can't processing '@Je75704823 그러니까요. 이렇게
## 보라보라님께서 종종 올려주시는 글들을 읽고 많은 생각을 하게 하네요. 염려해 주셔
## 서 감사해요. 엄마 가시고 살림을 너무 본격적으로 했나봐요🤦🏻 아기도 점점 무거워지
## 기는데 아가는 저와 떨어지고 싶어하지 않으니😂 강제애착육아중이에여🤣🤣🤣 아끼도
## 록 노력해볼게요💜🙏'.
## Warning in value[[3L]](cond): can't processing '@onlyvote18 꺅! 눈마주치고 웃으며 나에게 오는 아가들이라니~ 
## 넘 귀여운데여😍
## 육아는 귀엽지 않지만 아가들은 넘 귀여워여~
## 월욜 휴가는 온전히 누리시길~~💜'.
## Warning in value[[3L]](cond): can't processing '거기.. 지나가는 리름밈
## 에이든도
## 육아썰
## 주
## 세
## 요
## 🥺🥺🥺'.
## Warning in value[[3L]](cond): can't processing '혜성이가 어린 목소리로 엄마아!
## 하는데 그래 내가 니 에미다 라고 해줘야 할 것 같은 그런 느낌마저 들 뻔했구요 오빠
## 들이 나보다 연상이라 넘나 다행이다 유사육아덕질을 실시간으로 하는 것만은 피할 수
## 있어서😇😇'.
## Warning in value[[3L]](cond): can't processing '@shining_Suga707 @BTS_twt 나가야
## 죠ㅋㅋ 하루씩 육아를💜'.
## Warning in value[[3L]](cond): can't processing '@_lim_jane ㅋㅋㅋㅋ이제 표정관리
## @연습도 해야하는 엄마들,, 할거 되게 많네요 🤣ㅋㅋㅋ 오늘도 육아하느라 고생 많으
## 셨어요 재인님! 일찍 아가들 재우구 육퇴하자그여💚'.
# Flatten the per-tweet noun lists and keep only tokens of 2+ characters.
Kdata_word2 <- unlist(Kdata_text2)
Kdata_word2 <- Filter(function(tok) { nchar(tok) >= 2 }, Kdata_word2)

# Strip noise in the same order as before: control characters, the URL
# scheme, mentions, then all remaining punctuation.
noise_patterns <- c("\n", "\r", "https://", "@", "[[:punct:]]")
for (pat in noise_patterns) {
  Kdata_word2 <- gsub(pat, "", Kdata_word2)
}

# Remove Latin letters/digits, lone consonant jamo, and the ㅜ/ㅠ crying marks.
Kdata_word2 <- str_replace_all(Kdata_word2, "[A-Za-z0-9]", "")
Kdata_word2 <- gsub("[ㄱ-ㅎ]", "", Kdata_word2)
Kdata_word2 <- gsub("[ㅜ|ㅠ]", "", Kdata_word2)

# Corpus-specific stop words spotted in the raw tweets: particles, fandom
# names, and the query word itself.
stop_words <- c("로", "해서", "육아", "키선배님이", "놀토가", "호시는",
                "트니트니", "세븐", "그거", "갖다대니까", "하게")
for (sw in stop_words) {
  Kdata_word2 <- gsub(sw, "", Kdata_word2)
}

# Word cloud of the cleaned tokens; seed fixed for a reproducible layout.
set.seed(1234)
wordcloud(words = Kdata_word2, scale = c(4, 0.5),
          min.freq = 40, random.order = FALSE, max.words = 200,
          family = "AppleGothic", colors = pal)
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents

# Sentiment analysis of the parenting tweets with the same custom dictionary
res_sentiment2 <- analyzeSentiment(Kdata_word2,
                                  language="korean",
                                  rules=list("KoreanSentiment"=list(ruleSentiment, senti_dic_kr)),
                                  removeStopwords = F, stemming = F)

df2 <- data.frame(res_sentiment2, Kdata_word2)
df2 <- df2[!(df2$Kdata_word2==""),]  # drop rows whose token is empty

# Recode numeric scores into labels. As in the childbirth analysis, the first
# character assignment coerces column 1 to character, so the literal decimal
# strings below must match this run's output exactly; the order of these
# statements matters.
# NOTE(review): the final line flips 0.0714... (already labelled positive by
# none of the rules above) to "negative" AFTER the `< 0` pass — this looks
# like a deliberate manual override; confirm.
df2[is.na(df2$KoreanSentiment),1] <- "neutral"
df2[df2$KoreanSentiment == 0,1] <- "neutral"
df2[df2$KoreanSentiment == 1,1] <- "positive"
df2[df2$KoreanSentiment == 0.0555555555555556,1] <- "positive"
df2[df2$KoreanSentiment == 0.333333333333333,1] <- "positive"
df2[df2$KoreanSentiment == 0.142857142857143,1] <- "positive"
df2[df2$KoreanSentiment == 0.1,1] <- "positive"
df2[df2$KoreanSentiment == 0.0833333333333333,1] <- "positive"
df2[df2$KoreanSentiment == 0.0769230769230769,1] <- "positive"
df2[df2$KoreanSentiment == 0.0588235294117647,1] <- "positive"
df2[df2$KoreanSentiment == 0.05,1] <- "positive"
df2[df2$KoreanSentiment < 0,1] <- "negative"
df2[df2$KoreanSentiment == 0.0714285714285714,1] <- "negative"

df2_final <- df2[!(df2$KoreanSentiment=="neutral"),]  # neutral dominates; exclude from plot

# Bar chart of positive vs. negative token counts
ggplot(df2_final, aes(x = KoreanSentiment)) + 
  geom_bar(stat = "count", width = 0.7, fill = "aquamarine4") + 
  theme_minimal() + ggtitle("tweets about parenting in korea")

2.5 결론

분석방법으로 전진선택법과 Elasticnet방법을 사용해 보았다. 전진선택법은 변수를 하나씩 추가하며 AIC가 낮아지는 쪽을 선택하는 방법이다. 반면 Elasticnet은 L1(Lasso)과 L2(Ridge) 벌점을 함께 부과해 계수를 축소함으로써 예측 성능과 모델의 단순성(복잡도)을 동시에 고려하는 방법이다. 우리가 살펴본 수치상으론 전진선택법이 더 우수했다. 여기선 전진선택법 모델의 여러 지표들을 통합해서 알아보겠다.

tab_model(step_df, show.se = T, show.ci = F, show.stat= T, auto.label = F)
  Birth rate
Predictors Estimates std. Error Statistic p
(Intercept) -0.00 0.05 -0.00 1.000
old ratio -0.49 0.08 -5.85 <0.001
infant mortality rate 0.45 0.08 5.75 <0.001
industry employed -0.14 0.06 -2.55 0.013
gini_score -0.13 0.06 -2.13 0.037
Women suicide -0.08 0.06 -1.52 0.132
Observations 73
R2 / R2 adjusted 0.818 / 0.805
plot_model(step_df, sort.est = T, type = "est", wrap.labels=5)

3 OECD 회원국 분석

3.1 로지스틱 회귀분석

# Attach OECD membership to the country-level feature table.
Country <- rownames(df_vif)
df_vif <- cbind(df_vif, Country)
df_vif <- left_join(df_vif, df_oecd, by='Country')
df_vif$OECD[is.na(df_vif$OECD)] <- 0  # membership flag: 1 = OECD member, 0 = not

# Restore country names as rownames. Look the helper column up by name
# instead of hard-coding position 28, which silently breaks whenever a
# column is added or removed upstream.
country_col <- which(colnames(df_vif) == "Country")
rownames(df_vif) <- df_vif[, country_col]
df_vif <- df_vif[, -country_col]  # drop the helper Country column
df_vif$OECD <- as.numeric(df_vif$OECD)

# Drop predictors with severe multicollinearity (VIF > 10).
vif_model <- lm(`OECD` ~ ., data = df_vif)
vif_value <- vif(vif_model) > 10
# vif() returns one value per predictor (no entry for the OECD response), so
# `df_vif[, !vif_value]` relied on logical recycling to keep the trailing
# OECD column. Keep OECD explicitly so recycling can never drop it.
df_vif_modify <- df_vif[, c(!vif_value, TRUE)]

# Full logistic model; with this many predictors the classes separate
# (numerically 0/1 fitted probabilities), hence the glm.fit warning.
df_log_reg <- glm(`OECD` ~ ., family = binomial, data = df_vif_modify, maxit = 100)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(df_log_reg)

# Intercept-only null model: lower bound of the stepwise search.
df_log_reg_Null = glm(OECD ~ 1, family = binomial, data = df_vif_modify, maxit = 100)
summary(df_log_reg_Null)

# anova(df_log_reg_Null, df_log_reg, test="LRT")

# Forward stepwise selection (AIC) from the null model toward the full model.
step_df_vif_log_reg <- step(df_log_reg_Null, scope=list(lower=df_log_reg_Null, upper=df_log_reg),
                            direction="forward")
summary(step_df_vif_log_reg)
## 
## Call:
## glm(formula = OECD ~ `gdp per capita` + `old ratio` + refugees + 
##     `happy score` + `threatened species` + `industry employed` + 
##     unemployed, family = binomial, data = df_vif_modify, maxit = 100)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.54586  -0.10218  -0.00471   0.04423   1.85841  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)  
## (Intercept)           -1.0561     0.7486  -1.411   0.1583  
## `gdp per capita`       2.9799     1.6211   1.838   0.0660 .
## `old ratio`            1.6961     0.8029   2.113   0.0346 *
## refugees               2.5362     1.1460   2.213   0.0269 *
## `happy score`          3.9324     1.6149   2.435   0.0149 *
## `threatened species`   1.5092     0.7658   1.971   0.0487 *
## `industry employed`    1.9974     1.1339   1.762   0.0782 .
## unemployed             1.2554     0.9011   1.393   0.1636  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 99.536  on 72  degrees of freedom
## Residual deviance: 24.419  on 65  degrees of freedom
## AIC: 40.419
## 
## Number of Fisher Scoring iterations: 8
exp(coef(step_df_vif_log_reg))      # odds ratios: exponentiated coefficients
##          (Intercept)     `gdp per capita`          `old ratio` 
##            0.3478213           19.6852215            5.4528847 
##             refugees        `happy score` `threatened species` 
##           12.6318049           51.0281438            4.5233084 
##  `industry employed`           unemployed 
##            7.3697166            3.5091747
vif(step_df_vif_log_reg)            # multicollinearity check: all VIFs < 10
##     `gdp per capita`          `old ratio`             refugees 
##             2.493158             2.011996             2.735693 
##        `happy score` `threatened species`  `industry employed` 
##             3.936967             1.932457             3.257611 
##           unemployed 
##             3.854877

3.1.1 모델 평가

# ROC curve for a logistic model built on the section-2 linear-regression
# predictors.
# NOTE(review): these predictors (old ratio, infant mortality rate, ...) are
# NOT the ones forward selection chose above (gdp per capita, refugees, ...),
# yet the cutoff read off this curve (0.674) is applied to
# step_df_vif_log_reg's predictions below — confirm this mix is intentional.
df_log_reg_graph <- ROC(form = `OECD` ~ `old ratio`+
                          `infant mortality rate`+
                          `industry employed`+
                          `gini_score`+
                          `Women suicide`,
                        data = df_vif_modify,
                        plot = "ROC")

# Sensitivity/specificity at the chosen threshold (lr.eta ~ 0.674)
df_log_reg_graph$res[round(df_log_reg_graph$res$lr.eta,3) == 0.674,]
##                        sens      spec       pvp       pvn    lr.eta
## 0.673599504785467 0.8387097 0.9285714 0.1136364 0.1034483 0.6735995
df_log_reg_graph$AUC 
## [1] 0.9162826
df_log_reg_graph$lr
## 
## Call:  glm(formula = form, family = binomial, data = data)
## 
## Coefficients:
##             (Intercept)              `old ratio`  `infant mortality rate`  
##                -1.89486                  0.86574                 -4.21271  
##     `industry employed`               gini_score          `Women suicide`  
##                -0.02506                  0.96928                  0.07943  
## 
## Degrees of Freedom: 72 Total (i.e. Null);  67 Residual
## Null Deviance:       99.54 
## Residual Deviance: 54.49     AIC: 66.49
# caret
# Confusion matrix of the stepwise model's in-sample predictions at the
# 0.674 cutoff. df_log_reg$y holds the observed 0/1 response (same data, so
# it equals step_df_vif_log_reg's response).
confusionMatrix(
  as.factor(ifelse(predict(step_df_vif_log_reg, type = "response")>0.674,1,0)),
  as.factor(df_log_reg$y),
  positive = '1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 41  4
##          1  1 27
##                                           
##                Accuracy : 0.9315          
##                  95% CI : (0.8474, 0.9774)
##     No Information Rate : 0.5753          
##     P-Value [Acc > NIR] : 0.00000000001086
##                                           
##                   Kappa : 0.858           
##                                           
##  Mcnemar's Test P-Value : 0.3711          
##                                           
##             Sensitivity : 0.8710          
##             Specificity : 0.9762          
##          Pos Pred Value : 0.9643          
##          Neg Pred Value : 0.9111          
##              Prevalence : 0.4247          
##          Detection Rate : 0.3699          
##    Detection Prevalence : 0.3836          
##       Balanced Accuracy : 0.9236          
##                                           
##        'Positive' Class : 1               
## 
# F1 score: harmonic mean of precision (Pos Pred Value) and recall
# (Sensitivity), taken from the confusion matrix printed above.
precision <- 0.9643
recall <- 0.8710
F1_score <- 2 * (precision * recall) / (precision + recall)
F1_score
## [1] 0.9152785

3.2 나이브 베이즈

# 그룹화 함수
# Assign each element of `y` to one of `k` equal-frequency groups (1..k).
# Tied values share the lower group via min-rank; each group holds
# roughly length(y)/k observations.
mygroup <- function(y, k = 4) {
  n <- length(y)
  ranks <- rank(y, ties.method = "min")
  floor((ranks - 1) / (n / k)) + 1
}
# Discretize every column into 10 equal-frequency bins for Naive Bayes.
df_vif_factor <- data.frame()
df_vif_factor <- lapply(df_vif, function(x){mygroup(x,10)})
df_vif_factor <- as.data.frame(df_vif_factor)

# Binning also transformed the binary response: with this data's class
# balance, all OECD=1 rows land in bin 6, so map bin 6 back to 1.
# NOTE(review): the bin number depends on the 0/1 proportions — re-derive it
# if the data changes.
df_vif_factor$OECD <- ifelse(df_vif_factor$OECD == '6', 1, 0)

# Train/test split (75/25). Use nrow() instead of the hard-coded row count 73
# so the split stays correct when the number of countries changes.
n_obs <- nrow(df_vif_factor)
train_idx = sample(n_obs, n_obs * 3 / 4)
df_train = df_vif_factor[train_idx,]
df_test = df_vif_factor[-train_idx,]
df_train_labels = df_vif_factor[train_idx,]$OECD
df_test_labels = df_vif_factor[-train_idx,]$OECD


df_train = df_train[,-length(df_train)] # drop the OECD column (appended last above) from the features

# Class balance should be comparable between train and test.
prop.table(table(df_train_labels))
## df_train_labels
##         0         1 
## 0.5925926 0.4074074
prop.table(table(df_test_labels))
## df_test_labels
##         0         1 
## 0.5263158 0.4736842
# The proportions are similar.

# Fit Naive Bayes on the binned features.
df_classifier = naiveBayes(df_train, df_train_labels)

3.2.1 모델 평가

# Predict OECD membership for the held-out test countries
df_test_pred = predict(df_classifier, df_test)

# Cross-tabulate predictions vs. actual labels (counts and table proportions)
CrossTable(df_test_pred, df_test_labels,
           prop.chisq = F, prop.c = F, prop.r = FALSE,
           dnn = c('predicted', 'actual'))
##    Cell Contents 
## |-------------------------|
## |                       N | 
## |         N / Table Total | 
## |-------------------------|
## 
## ==================================
##              actual
## predicted        0       1   Total
## ----------------------------------
## 0                9       4      13
##              0.474   0.211        
## ----------------------------------
## 1                1       5       6
##              0.053   0.263        
## ----------------------------------
## Total           10       9      19
## ==================================
# Full evaluation metrics with OECD membership (1) as the positive class
confusionMatrix(as.factor(df_test_pred),
                as.factor(df_test_labels), 
                positive = '1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 0 1
##          0 9 4
##          1 1 5
##                                          
##                Accuracy : 0.7368         
##                  95% CI : (0.488, 0.9085)
##     No Information Rate : 0.5263         
##     P-Value [Acc > NIR] : 0.05192        
##                                          
##                   Kappa : 0.4633         
##                                          
##  Mcnemar's Test P-Value : 0.37109        
##                                          
##             Sensitivity : 0.5556         
##             Specificity : 0.9000         
##          Pos Pred Value : 0.8333         
##          Neg Pred Value : 0.6923         
##              Prevalence : 0.4737         
##          Detection Rate : 0.2632         
##    Detection Prevalence : 0.3158         
##       Balanced Accuracy : 0.7278         
##                                          
##        'Positive' Class : 1              
## 
# Improved variant: Laplace smoothing (laplace = 1) avoids zero conditional
# probabilities for feature bins unseen in the training split.
df_classifier_modify <- naiveBayes(df_train, df_train_labels, laplace = 1)
df_test_pred_modify = predict(df_classifier_modify, df_test)

CrossTable(df_test_pred_modify, df_test_labels,
           prop.chisq = F, prop.c = F, prop.r = FALSE,
           dnn = c('predicted', 'actual'))
##    Cell Contents 
## |-------------------------|
## |                       N | 
## |         N / Table Total | 
## |-------------------------|
## 
## ==================================
##              actual
## predicted        0       1   Total
## ----------------------------------
## 0                9       4      13
##              0.474   0.211        
## ----------------------------------
## 1                1       5       6
##              0.053   0.263        
## ----------------------------------
## Total           10       9      19
## ==================================
# Note: identical confusion matrix to the unsmoothed model — smoothing did
# not change any prediction on this test set.
confusionMatrix(as.factor(df_test_pred_modify),
                as.factor(df_test_labels), 
                positive = '1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 0 1
##          0 9 4
##          1 1 5
##                                          
##                Accuracy : 0.7368         
##                  95% CI : (0.488, 0.9085)
##     No Information Rate : 0.5263         
##     P-Value [Acc > NIR] : 0.05192        
##                                          
##                   Kappa : 0.4633         
##                                          
##  Mcnemar's Test P-Value : 0.37109        
##                                          
##             Sensitivity : 0.5556         
##             Specificity : 0.9000         
##          Pos Pred Value : 0.8333         
##          Neg Pred Value : 0.6923         
##              Prevalence : 0.4737         
##          Detection Rate : 0.2632         
##    Detection Prevalence : 0.3158         
##       Balanced Accuracy : 0.7278         
##                                          
##        'Positive' Class : 1              
## 

결론